Business Question

The CEO of a Health Insurance company wants to expand the business by offering Car Insurance. To do this, he asked his employees to call some customers and ask if they would buy Car insurance if the company offered it. Through this survey, they obtained a large database containing customer characteristics and their survey responses.

Now, the company is ready to launch the new service, and the sales team have a list of 127,000 customers to make phone calls and offer the new Insurance. The company only has the resourses to call 20,000 customers, so he hired a Data Scientist to study those customers and define the 20,000 with best chances of buying the Insurance.

Challenge

Build a model that classifies a list of customers, informing the customer's predisposition to take out auto insurance or not. With this solution, the sales team hopes to be able to prioritize people with the greatest interest in the new product and thus optimize the campaign by only making contacts with the customers most likely to make the purchase.

0.0. Imports¶

In [1]:
import optuna
import pickle
import psycopg2
import warnings

import pandas             as pd
import numpy              as np
import seaborn            as sns
import scikitplot         as skplt
import lightgbm           as lgb
import matplotlib.pyplot  as plt
import matplotlib.patches as mpatches

from scipy                   import stats
from sqlalchemy.sql          import text
from IPython.display         import HTML
from sqlalchemy              import create_engine
from dataprep.eda            import create_report

from sklearn.linear_model    import LogisticRegression
from sklearn.neighbors       import KNeighborsClassifier
from sklearn.ensemble        import RandomForestClassifier
from sklearn.ensemble        import ExtraTreesClassifier
from xgboost                 import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics         import confusion_matrix
from sklearn.metrics         import recall_score
from sklearn.metrics         import precision_score
from sklearn.metrics         import f1_score
from sklearn.metrics         import roc_curve
from sklearn.metrics         import roc_auc_score
from sklearn.utils           import class_weight
from sklearn.preprocessing   import StandardScaler
from sklearn.preprocessing   import MinMaxScaler

warnings.filterwarnings('ignore')

0.1. Helper Functions¶

In [2]:
def jupyter_settings():
    %matplotlib inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    display( HTML( '<style>.container { width:100% !important; }</style>') )


    sns.set()

def cramer_v(x, y):
    cm = pd.crosstab(x, y).values
    n = cm.sum()
    r, k = cm.shape
    chi2 = stats.chi2_contingency( cm )[0]
    chi2corr = max(0, chi2 - (k - 1) * (r - 1) / (n - 1) )
    kcorr = k - (k - 1) ** 2 / (n - 1)
    rcorr = r - (r - 1) ** 2 / (n - 1)
    return np.sqrt((chi2corr / n) / ( min(kcorr - 1, rcorr - 1 )))
    
def cross_validation(X, y, model,model_name = "Model", test_size = 0.3, cv = 10, top_k = 2000, verbose = True):
    
    list_scores = []
    
    for i in range(1, cv + 1):
        X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = test_size )

        # Preprocess data
        X_train, X_test, y_train, y_test = preprocessing(X_train, X_test, y_train, y_test)

        # Model training
        model = model.fit(X_train, y_train) 

        # Model evaluation   
        scores = print_scores(y_test = y_test,
                              y_pred  = model.predict(X_test), 
                              predict_proba = model.predict_proba(X_test),
                              top_k = top_k,
                              verbose = verbose)
        
        list_scores.append(scores)
    
    list_scores = np.array(list_scores).transpose()
        
    data_frame = pd.DataFrame( {'Model Name': model_name,
                                'Precision Top-K CV': np.round( np.mean( list_scores[0] ), 2 ).astype(str ) + ' +/- ' + np.round( np.std( list_scores[0] ), 4 ).astype( str ),
                                'Recall Top-K CV': np.round( np.mean( list_scores[1] ), 2 ).astype( str ) + ' +/- ' + np.round( np.std( list_scores[1] ), 4 ).astype( str ),
                                'F1 Score CV': np.round( np.mean( list_scores[2] ), 2 ).astype( str ) + ' +/- ' + np.round( np.std( list_scores[2] ), 4 ).astype( str ),
                                'AUC Score CV': np.round( np.mean( list_scores[3] ), 2 ).astype( str ) + ' +/- ' + np.round( np.std( list_scores[3] ), 4 ).astype( str )}, index=[0] )
    
        
    return data_frame
    
    
def plot_curves(y_test, predict_proba, verbose):
    fpr, tpr, _ = roc_curve(y_test.values,  predict_proba[:,1])

    auc = np.round(roc_auc_score(y_test, predict_proba[::,1]),3)
    auc = auc
    
    
    if verbose == True:
#         plt.figure(figsize = (4, 4))
#         plt.plot(fpr,tpr)
#         plt.ylabel('True Positive Rate')
#         plt.xlabel('False Positive Rate')
#         plt.title('ROC Curve  |  AUC Score: ' + str( auc ))
        
        plt.figure(figsize = (4, 4))
        skplt.metrics.plot_cumulative_gain(y_test, predict_proba, figsize = (4,4))
        
    return auc

def score_top_k(y_test, predict_proba, top_k):
    df = pd.DataFrame()
    df['predictions'] = predict_proba[:,1]
    df['y_test'] = y_test.values
    df = df.sort_values('predictions', ascending = False).reset_index(drop = True).reset_index()
    df['score'] = df.apply(lambda x: 1 if ( x['index'] <= top_k and x['y_test'] == 1) else 0, axis = 1 )
    precision = df['score'].sum() / top_k
    recall = df['score'].sum() / df['y_test'].sum()
    return {'precision': precision, 'recall': recall}


    
def print_scores(y_test, y_pred, predict_proba, top_k,verbose):
    
    precision = np.round(precision_score(y_test, y_pred, pos_label = 1),3)
    recall = np.round(recall_score   (y_test, y_pred, pos_label = 1),3)
    f1 = np.round(f1_score   (y_test, y_pred, pos_label = 1),3)
    auc = plot_curves(y_test, predict_proba, verbose)
    precision_top_k = score_top_k(y_test, predict_proba, top_k)['precision']
    recall_top_k = score_top_k(y_test, predict_proba, top_k)['recall']
    
    if verbose == True:
        
        print("Precision top K: ", precision_top_k, " | Recall Top K: ", recall_top_k, f1, " | AUC Score: ", auc)
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
        
    
    scores = [precision_top_k, recall_top_k, f1, auc]
    
    return scores

def preprocessing(X_train, X_test, y_train, y_test):   
    
    # Transformation
    
    # Gender
    X_train['gender'] = X_train['gender'].apply(lambda x: 0 if x == 'Male' else 1)
    X_test['gender'] = X_test['gender'].apply(lambda x: 0 if x == 'Male' else 1)
    
    # Vehicle damage
    X_train['vehicle_damage'] = X_train['vehicle_damage'].apply(lambda x: 0 if x == 'No' else 1)
    X_test['vehicle_damage'] = X_test['vehicle_damage'].apply(lambda x: 0 if x == 'No' else 1)
    
    # Vehicle age
    X_train['vehicle_age'] = X_train['vehicle_age'].apply(lambda x: 0 if x == '< 1 Year' else 1 if x == '1-2 Year' else 2)
    X_test['vehicle_age'] = X_test['vehicle_age'].apply(lambda x: 0 if x == '< 1 Year' else 1 if x == '1-2 Year' else 2)
    
    # region_code - Frequency Encoding / Target Encoding / Weighted Target Encoding
    # Target Encoding
    aux = X_train.copy()
    aux['response'] = y_train
    target_encode_region_code = aux.groupby('region_code')['response'].mean()
    X_train['region_code'] = X_train['region_code'].map( target_encode_region_code )
    X_test['region_code'] = X_test['region_code'].map( target_encode_region_code )
    X_test['region_code'].fillna(0, inplace = True)
    
    # policy sales channel - Frequency Encoding / Target Encoding / Weighted Target Encoding
    # Frequency encoding
    target_encode_policy_sales = aux.groupby('policy_sales_channel')['response'].count() / len(insurance_data5)
    X_train['policy_sales_channel'] = X_train['policy_sales_channel'].map( target_encode_policy_sales )
    X_test['policy_sales_channel'] = X_test['policy_sales_channel'].map( target_encode_policy_sales )
    X_test['policy_sales_channel'].fillna(0, inplace = True)
    
    # Feature selection
    X_train = X_train[['age', 'region_code', 'policy_sales_channel', 'previously_insured', 'annual_premium', 'vintage', 'vehicle_damage']]    
    X_test = X_test[['age', 'region_code', 'policy_sales_channel', 'previously_insured', 'annual_premium', 'vintage', 'vehicle_damage']]    
    
    return X_train, X_test, y_train, y_test
    



jupyter_settings()

0.2. Loading Data¶

In [3]:
# # Engine to connect database
# engine = create_engine('postgresql+psycopg2://user:password@hostname/database_name')
# conn = engine.connect()

# query = '''
#         SELECT 
#               pu.id,
#               gender,
#               age,
#               region_code,
#               policy_sales_channel,
#               previously_insured,
#               annual_premium,
#               vintage,
#               response,
#               driving_license,
#               vehicle_age,
#               vehicle_damage
              
#         FROM
#               pa004.users pu
#               LEFT JOIN pa004.insurance pi ON (pi.id = pu.id)
#               LEFT JOIN pa004.vehicle pv ON (pi.id = pv.id);
# '''

# with conn.execution_options(autocommit=True) as conn:
#     query = conn.execute(text(query))

    
# df = pd.DataFrame(query.fetchall())
# df.columns = ['id','gender','age','region_code','policy_sales_channel','previously_insured',
#               'annual_premium','vintage','response','driving_license','vehicle_age','vehicle_damage']

# conn.close()
In [4]:
df = pd.read_csv('../data/insurance_data.csv')
In [5]:
df.head()
Out[5]:
id gender age region_code policy_sales_channel previously_insured annual_premium vintage response driving_license vehicle_age vehicle_damage
0 7 Male 23 11.0 152.0 0 23367.0 249 0 1 < 1 Year Yes
1 13 Female 41 15.0 14.0 1 31409.0 221 0 1 1-2 Year No
2 18 Female 25 35.0 152.0 1 46622.0 299 0 1 < 1 Year No
3 31 Female 26 8.0 160.0 0 2630.0 136 0 1 < 1 Year No
4 39 Male 45 8.0 124.0 0 42297.0 264 0 1 1-2 Year Yes

1.0. Data Description¶

In [6]:
insurance_data1 = df.copy()

1.1. Columns Description¶

1.2. Data Dimensions¶

In [7]:
insurance_data1.shape
Out[7]:
(381109, 12)

1.3. Data Types¶

In [8]:
insurance_data1.dtypes
Out[8]:
id                        int64
gender                   object
age                       int64
region_code             float64
policy_sales_channel    float64
previously_insured        int64
annual_premium          float64
vintage                   int64
response                  int64
driving_license           int64
vehicle_age              object
vehicle_damage           object
dtype: object

1.4. Check NA¶

In [9]:
insurance_data1.isna().sum()
Out[9]:
id                      0
gender                  0
age                     0
region_code             0
policy_sales_channel    0
previously_insured      0
annual_premium          0
vintage                 0
response                0
driving_license         0
vehicle_age             0
vehicle_damage          0
dtype: int64

1.5. Descriptive Statistics¶

1.5.1. Numerical Attributes¶

In [10]:
numerical_columns = ['annual_premium','policy_sales_channel', 'vintage','age']
In [11]:
data_statistics = insurance_data1[numerical_columns].describe().reset_index()
skew = pd.DataFrame(insurance_data1[numerical_columns].apply(lambda x: x.skew())).T
data_statistics = pd.concat([data_statistics, skew]).fillna('Skew')
kurtosis = pd.DataFrame(insurance_data1[numerical_columns].apply(lambda x: x.kurtosis())).T
data_statistics = pd.concat([data_statistics, kurtosis]).fillna('Kurtosis')
data_statistics.set_index('index')
data_statistics
Out[11]:
index annual_premium policy_sales_channel vintage age
0 count 381109.000000 381109.000000 381109.000000 381109.000000
1 mean 30564.389581 112.034295 154.347397 38.822584
2 std 17213.155057 54.203995 83.671304 15.511611
3 min 2630.000000 1.000000 10.000000 20.000000
4 25% 24405.000000 29.000000 82.000000 25.000000
5 50% 31669.000000 133.000000 154.000000 36.000000
6 75% 39400.000000 152.000000 227.000000 49.000000
7 max 540165.000000 163.000000 299.000000 85.000000
0 Skew 1.766087 -0.900008 0.003030 0.672539
0 Kurtosis 34.004569 -0.970810 -1.200688 -0.565655

1.5.2. Categorical Attributes¶

In [12]:
fig, axis = plt.subplots(figsize = (19, 12))

plt.subplot(2, 3, 1)
sns.barplot(x = insurance_data1['previously_insured'].value_counts().index.values, 
            y = insurance_data1['previously_insured'].value_counts().values)
plt.title('previously_insured')

plt.subplot(2, 3, 2)
sns.barplot(x = insurance_data1['response'].value_counts().index.values, 
            y = insurance_data1['response'].value_counts().values)
plt.title('Response')

plt.subplot(2, 3, 3)
sns.barplot(x = insurance_data1['gender'].value_counts().index.values, 
            y = insurance_data1['gender'].value_counts().values)
plt.title('Gender')

plt.subplot(2, 3, 4)
sns.barplot(x = insurance_data1['vehicle_age'].value_counts().index.values, 
            y = insurance_data1['vehicle_age'].value_counts().values)
plt.title('vehicle_age')

plt.subplot(2, 3, 5)
sns.barplot(x = insurance_data1['vehicle_damage'].value_counts().index.values, 
            y = insurance_data1['vehicle_damage'].value_counts().values)
plt.title('vehicle_damage')

plt.subplot(2, 3, 6)
sns.barplot(x = insurance_data1['driving_license'].value_counts().index.values, 
            y = insurance_data1['driving_license'].value_counts().values)
plt.title('driving_license')
Out[12]:
Text(0.5, 1.0, 'driving_license')
In [13]:
insurance_data1['previously_insured'].value_counts().values
Out[13]:
array([206481, 174628])

2.0. Feature Engineering¶

In [14]:
insurance_data2 = insurance_data1.copy()

2.1. Feature Engineering¶

In [15]:
# insurance_data2['important_sales_channel'] =  insurance_data2['policy_sales_channel'].apply(lambda x: x if x in [26.0, 124.0, 152.0,160.0] else 0)

This feature didn't have relevant impact on the result so I decided to not use it.

3.0. Data Filtering¶

In [16]:
insurance_data3 = insurance_data2.copy()

Nothing to filter

4.0. Exploratory Data Analysis¶

In [17]:
insurance_data4 = insurance_data3.copy()

4.1. Univariate Analysis¶

In [18]:
report = create_report(insurance_data4).show()
  0%|                                                                                                         …
DataPrep Report
DataPrep Report Overview
Variables ≡
id gender age region_code policy_sales_channel previously_insured annual_premium vintage response driving_license vehicle_age vehicle_damage
Interactions Correlations Missing Values

Overview

Dataset Statistics

Number of Variables 12
Number of Rows 381109
Missing Cells 0
Missing Cells (%) 0.0%
Duplicate Rows 0
Duplicate Rows (%) 0.0%
Total Size in Memory 93.9 MB
Average Row Size in Memory 258.5 B
Variable Types
  • Numerical: 6
  • Categorical: 6

Dataset Insights

id is uniformly distributed Uniform
age is skewed Skewed
region_code is skewed Skewed
policy_sales_channel is skewed Skewed
annual_premium is skewed Skewed
previously_insured has constant length 1 Constant Length
response has constant length 1 Constant Length
driving_license has constant length 1 Constant Length

Variables


id

numerical

Approximate Distinct Count 381109
Approximate Unique (%) 100.0%
Missing 0
Missing (%) 0.0%
Infinite 0
Infinite (%) 0.0%
Memory Size 6097744
Mean 190555
Minimum 1
Maximum 381109
Zeros 0
Zeros (%) 0.0%
Negatives 0
Negatives (%) 0.0%
  • id is uniformly distributed

Quantile Statistics

Minimum 1
5-th Percentile 19056.4
Q1 95278
Median 190555
Q3 285832
95-th Percentile 362053.6
Maximum 381109
Range 381108
IQR 190554

Descriptive Statistics

Mean 190555
Standard Deviation 110016.8362
Variance 1.2104e+10
Sum 7.2622e+10
Skewness -8.0711e-18
Kurtosis -1.2
Coefficient of Variation 0.5773
  • id is not normally distributed (p-value 7.259388078140076e-05)

gender

categorical

Approximate Distinct Count 2
Approximate Unique (%) 0.0%
Missing 0
Missing (%) 0.0%
Memory Size 26646561

Length

Mean 4.9185
Standard Deviation 0.9967
Median 4
Minimum 4
Maximum 6

Sample

1st row Male
2nd row Female
3rd row Female
4th row Female
5th row Male

Letter

Count 1874476
Lowercase Letter 1493367
Space Separator 0
Uppercase Letter 381109
Dash Punctuation 0
Decimal Number 0
  • The top 2 categories (Male, Female) take over 50.0%

age

numerical

Approximate Distinct Count 66
Approximate Unique (%) 0.0%
Missing 0
Missing (%) 0.0%
Infinite 0
Infinite (%) 0.0%
Memory Size 6097744
Mean 38.8226
Minimum 20
Maximum 85
Zeros 0
Zeros (%) 0.0%
Negatives 0
Negatives (%) 0.0%
  • age is skewed right (γ1 = 0.6725)

Quantile Statistics

Minimum 20
5-th Percentile 21
Q1 25
Median 36
Q3 49
95-th Percentile 69
Maximum 85
Range 65
IQR 24

Descriptive Statistics

Mean 38.8226
Standard Deviation 15.5116
Variance 240.6101
Sum 1.4796e+07
Skewness 0.6725
Kurtosis -0.5657
Coefficient of Variation 0.3996
  • age is not normally distributed (p-value 7.191893600196644e-12)

region_code

numerical

Approximate Distinct Count 53
Approximate Unique (%) 0.0%
Missing 0
Missing (%) 0.0%
Infinite 0
Infinite (%) 0.0%
Memory Size 6097744
Mean 26.3888
Minimum 0
Maximum 52
Zeros 2021
Zeros (%) 0.5%
Negatives 0
Negatives (%) 0.0%
  • region_code is skewed left (γ1 = -0.1153)

Quantile Statistics

Minimum 0
5-th Percentile 5
Q1 15
Median 28
Q3 35
95-th Percentile 47
Maximum 52
Range 52
IQR 20

Descriptive Statistics

Mean 26.3888
Standard Deviation 13.2299
Variance 175.0299
Sum 1.0057e+07
Skewness -0.1153
Kurtosis -0.8679
Coefficient of Variation 0.5013
  • region_code is not normally distributed (p-value 9.806845814393407e-22)

policy_sales_channel

numerical

Approximate Distinct Count 155
Approximate Unique (%) 0.0%
Missing 0
Missing (%) 0.0%
Infinite 0
Infinite (%) 0.0%
Memory Size 6097744
Mean 112.0343
Minimum 1
Maximum 163
Zeros 0
Zeros (%) 0.0%
Negatives 0
Negatives (%) 0.0%
  • policy_sales_channel is skewed left (γ1 = -0.9)

Quantile Statistics

Minimum 1
5-th Percentile 26
Q1 29
Median 133
Q3 152
95-th Percentile 160
Maximum 163
Range 162
IQR 123

Descriptive Statistics

Mean 112.0343
Standard Deviation 54.204
Variance 2938.073
Sum 4.2697e+07
Skewness -0.9
Kurtosis -0.9708
Coefficient of Variation 0.4838
  • policy_sales_channel is not normally distributed (p-value 1.854436617615345e-16)

previously_insured

categorical

Approximate Distinct Count 2
Approximate Unique (%) 0.0%
Missing 0
Missing (%) 0.0%
Memory Size 25153194

Length

Mean 1
Standard Deviation 0
Median 1
Minimum 1
Maximum 1

Sample

1st row 0
2nd row 1
3rd row 1
4th row 0
5th row 0

Letter

Count 0
Lowercase Letter 0
Space Separator 0
Uppercase Letter 0
Dash Punctuation 0
Decimal Number 381109
  • The top 2 categories (0, 1) take over 50.0%
  • previously_insured has words of constant length

annual_premium

numerical

Approximate Distinct Count 48838
Approximate Unique (%) 12.8%
Missing 0
Missing (%) 0.0%
Infinite 0
Infinite (%) 0.0%
Memory Size 6097744
Mean 30564.3896
Minimum 2630
Maximum 540165
Zeros 0
Zeros (%) 0.0%
Negatives 0
Negatives (%) 0.0%
  • annual_premium is skewed right (γ1 = 1.7661)

Quantile Statistics

Minimum 2630
5-th Percentile 2630
Q1 24405
Median 31669
Q3 39400
95-th Percentile 55176
Maximum 540165
Range 537535
IQR 14995

Descriptive Statistics

Mean 30564.3896
Standard Deviation 17213.1551
Variance 2.9629e+08
Sum 1.1648e+10
Skewness 1.7661
Kurtosis 34.0041
Coefficient of Variation 0.5632
  • annual_premium is not normally distributed (p-value 2.270630572634235e-16)
  • annual_premium has 10320 outliers

vintage

numerical

Approximate Distinct Count 290
Approximate Unique (%) 0.1%
Missing 0
Missing (%) 0.0%
Infinite 0
Infinite (%) 0.0%
Memory Size 6097744
Mean 154.3474
Minimum 10
Maximum 299
Zeros 0
Zeros (%) 0.0%
Negatives 0
Negatives (%) 0.0%
  • vintage is skewed right (γ1 = 0.003)

Quantile Statistics

Minimum 10
5-th Percentile 24
Q1 82
Median 154
Q3 227
95-th Percentile 285
Maximum 299
Range 289
IQR 145

Descriptive Statistics

Mean 154.3474
Standard Deviation 83.6713
Variance 7000.8871
Sum 5.8823e+07
Skewness 0.00303
Kurtosis -1.2007
Coefficient of Variation 0.5421
  • vintage is not normally distributed (p-value 0.000843392300743485)

response

categorical

Approximate Distinct Count 2
Approximate Unique (%) 0.0%
Missing 0
Missing (%) 0.0%
Memory Size 25153194
  • The largest value (0) is over 7.16 times larger than the second largest value (1)

Length

Mean 1
Standard Deviation 0
Median 1
Minimum 1
Maximum 1

Sample

1st row 0
2nd row 0
3rd row 0
4th row 0
5th row 0

Letter

Count 0
Lowercase Letter 0
Space Separator 0
Uppercase Letter 0
Dash Punctuation 0
Decimal Number 381109
  • The top 2 categories (0, 1) take over 50.0%
  • The largest value (0) is over 7.16 times larger than the second largest value (1)
  • response has words of constant length

driving_license

categorical

Approximate Distinct Count 2
Approximate Unique (%) 0.0%
Missing 0
Missing (%) 0.0%
Memory Size 25153194
  • The largest value (1) is over 468.35 times larger than the second largest value (0)

Length

Mean 1
Standard Deviation 0
Median 1
Minimum 1
Maximum 1

Sample

1st row 1
2nd row 1
3rd row 1
4th row 1
5th row 1

Letter

Count 0
Lowercase Letter 0
Space Separator 0
Uppercase Letter 0
Dash Punctuation 0
Decimal Number 381109
  • The top 2 categories (1, 0) take over 50.0%
  • The largest value (1) is over 468.35 times larger than the second largest value (0)
  • driving_license has words of constant length

vehicle_age

categorical

Approximate Distinct Count 3
Approximate Unique (%) 0.0%
Missing 0
Missing (%) 0.0%
Memory Size 27836964

Length

Mean 8.042
Standard Deviation 0.2006
Median 8
Minimum 8
Maximum 9

Sample

1st row < 1 Year
2nd row 1-2 Year
3rd row < 1 Year
4th row < 1 Year
5th row 1-2 Year

Letter

Count 1540443
Lowercase Letter 1159334
Space Separator 561902
Uppercase Letter 381109
Dash Punctuation 200316
Decimal Number 581425
  • The top 2 categories (1-2 Year, < 1 Year) take over 50.0%
  • The largest value (year) is over 1.82 times larger than the second largest value (12)

vehicle_damage

categorical

Approximate Distinct Count 2
Approximate Unique (%) 0.0%
Missing 0
Missing (%) 0.0%
Memory Size 25726716

Length

Mean 2.5049
Standard Deviation 0.5
Median 3
Minimum 2
Maximum 3

Sample

1st row Yes
2nd row No
3rd row No
4th row No
5th row Yes

Letter

Count 954631
Lowercase Letter 573522
Space Separator 0
Uppercase Letter 381109
Dash Punctuation 0
Decimal Number 0
  • The top 2 categories (Yes, No) take over 50.0%

Interactions

Correlations

Missing Values

Report generated with DataPrep

In [19]:
# Remove driving licence

4.2. Bivatiate Analysis¶

4.2.1. Response Variable Relationship Analysis¶

4.2.1.1. Age¶

In [20]:
# Age

plt.figure(figsize = (18, 4))

plt.subplot(1, 3, 1)
sns.boxplot(x = 'response', y = 'age', data = insurance_data4)
plt.title('Age x Response')

plt.subplot(1, 3, 2)
sns.histplot(data = insurance_data4['age'][insurance_data4['response'] == 1] )
plt.title('Age x Response = "Yes"')

plt.subplot(1, 3, 3)
plt.title('Age x Response = "No"')
sns.histplot(data = insurance_data4['age'][insurance_data4['response'] == 0] );
In [21]:
aux = insurance_data4[['age', 'response']]
aux1 = aux.groupby('age').count().reset_index()
aux1['response'] = 1
aux2 = pd.DataFrame(aux.value_counts(['age', 'response'])).reset_index().sort_values('age')
aux3 = pd.DataFrame(aux2.groupby('age').sum()[0]).reset_index()
aux3 = pd.merge(aux2, aux3, how = 'left', on = 'age')
aux3['percentage'] = aux3['0_x'] / aux3['0_y']
In [22]:
# Policy sales channel
plt.figure(figsize = (22, 4 ))

bar1 = sns.barplot(x = 'age', y = 'response', data =  aux1, color = 'darkblue', errorbar =  None)
bar2 = sns.barplot(x = 'age', y = 'percentage', data = aux3[aux3['response'] == 1], color = 'lightblue', errorbar =  None)

# add legend
top_bar = mpatches.Patch(color='darkblue', label='Response = No')
bottom_bar = mpatches.Patch(color='lightblue', label='Response = Yes')
plt.legend(handles=[top_bar, bottom_bar])
plt.ylim(0, 1)
plt.xticks(rotation= 90, size = 7)
plt.title('Percentage Responses by Age')

# show the graph
plt.show()

4.2.1.2. Annual Premium¶

In [23]:
# Annual premium

plt.figure(figsize = (18, 4))
sns.boxplot(x = 'response', y = 'annual_premium', data = insurance_data4[insurance_data4['annual_premium'] < 100000])
plt.title('annual premium x Response')

plt.subplot(1, 3, 2)
sns.distplot(insurance_data4[insurance_data4['response'] == 1]['annual_premium'], kde = False )

plt.subplot(1, 3, 2)
sns.distplot(insurance_data4[insurance_data4['response'] == 0]['annual_premium'], kde = False )
plt.title('Response: Yes x No')

plt.subplot(1, 3, 3)
aux = insurance_data4[(insurance_data4['annual_premium'] < 100000) & (insurance_data4['annual_premium'] > 15000)]
aux1 = aux[aux['response'] == 1]
sns.histplot(data = aux1['annual_premium'] )

plt.subplot(1, 3, 3)
aux2 = aux[aux['response'] == 0]
sns.histplot(data = aux2['annual_premium'])
plt.title('Response: Yes x No');

4.2.1.3. Region Code¶

In [24]:
# Region code
plt.figure(figsize = (6, 4))
aux = insurance_data4[['region_code', 'id', 'response']].groupby(['region_code','response']).count().reset_index()
sns.scatterplot(data = aux, x = 'region_code', y = 'id', hue = 'response')
Out[24]:
<AxesSubplot: xlabel='region_code', ylabel='id'>

4.2.1.4. Gender¶

In [25]:
# Gender
aux = insurance_data4[['id', 'gender','response']].groupby(['gender','response']).count().reset_index()
aux['total_gender'] = aux.apply(lambda x: sum(aux['id'][aux['gender'] == 'Female']) if x['gender'] == 'Female' 
                                                                                    else sum(aux['id'][aux['gender'] == 'Male']), axis = 1)
aux['total_gender'] = aux['id'] / aux['total_gender']
In [26]:
plt.figure(figsize = (6, 4))

sns.barplot(data = aux, x = 'gender', y = 'total_gender', hue = 'response')
plt.title('Response proportion by Gender');

4.2.1.5. Previously Insured¶

In [27]:
# Previously insured
aux = insurance_data4[['id', 'previously_insured','response']].groupby(['previously_insured','response']).count().reset_index()
aux['total_previously_insured'] = aux.apply(lambda x: sum(aux['id'][aux['previously_insured'] == 0]) if x['previously_insured'] == 0 
                                                                                                     else sum(aux['id'][aux['previously_insured'] == 1]), axis = 1)
aux['total_previously_insured'] = aux['id'] / aux['total_previously_insured']
plt.figure(figsize = (6, 4))

sns.barplot(data = aux, x = 'previously_insured', y = 'total_previously_insured', hue = 'response')
plt.title('Response proportion by Previously Insured');
In [28]:
aux
Out[28]:
previously_insured response id total_previously_insured
0 0 0 159929 0.774546
1 0 1 46552 0.225454
2 1 0 174470 0.999095
3 1 1 158 0.000905

Data leakage?

4.2.1.6. Vehichle Age¶

In [29]:
aux = insurance_data4[['vehicle_age', 'response']]
aux1 = aux.groupby('vehicle_age').count().reset_index()
aux1['response'] = 1
aux2 = pd.DataFrame(aux.value_counts(['vehicle_age', 'response'])).reset_index().sort_values('vehicle_age')
aux3 = pd.DataFrame(aux2.groupby('vehicle_age').sum()[0]).reset_index()
aux3 = pd.merge(aux2, aux3, how = 'left', on = 'vehicle_age')
aux3['percentage'] = aux3['0_x'] / aux3['0_y']
In [30]:
# Policy sales channel
plt.figure(figsize = (6, 4 ))

bar1 = sns.barplot(x = 'vehicle_age', y = 'response', data =  aux1, color = 'darkblue', errorbar =  None)
bar2 = sns.barplot(x = 'vehicle_age', y = 'percentage', data = aux3[aux3['response'] == 1], color = 'lightblue', errorbar =  None)

# add legend
top_bar = mpatches.Patch(color='darkblue', label='Response = No')
bottom_bar = mpatches.Patch(color='lightblue', label='Response = Yes')
plt.legend(handles=[top_bar, bottom_bar])
plt.ylim(0, 1.25)
plt.xticks(rotation= 90, size = 7)
plt.title('Percentage Responses by Vehicle Age')

# show the graph
plt.show()

4.2.1.7. Policy Sales Channel¶

In [31]:
aux = insurance_data4[['policy_sales_channel', 'response']]
aux1 = aux.groupby('policy_sales_channel').count().reset_index()
aux1['response'] = 1
aux2 = pd.DataFrame(aux.value_counts(['policy_sales_channel', 'response'])).reset_index().sort_values('policy_sales_channel')
aux3 = pd.DataFrame(aux2.groupby('policy_sales_channel').sum()[0]).reset_index()
aux3 = pd.merge(aux2, aux3, how = 'left', on = 'policy_sales_channel')
aux3['percentage'] = aux3['0_x'] / aux3['0_y']
In [32]:
# Policy sales channel
plt.figure(figsize = (22, 4 ))

bar1 = sns.barplot(x = 'policy_sales_channel', y = 'response', data =  aux1, color = 'darkblue', errorbar =  None)
bar2 = sns.barplot(x = 'policy_sales_channel', y = 'percentage', data = aux3[aux3['response'] == 1], color = 'lightblue', errorbar =  None)

# add legend
top_bar = mpatches.Patch(color='darkblue', label='Response = No')
bottom_bar = mpatches.Patch(color='lightblue', label='Response = Yes')
plt.legend(handles=[top_bar, bottom_bar])
plt.ylim(0, 1)
plt.xticks(rotation= 90, size = 7)
plt.title('Percentage Responses by Policy Sales Channel')

# show the graph
plt.show()

4.2.1.8. Vintage¶

In [33]:
aux = insurance_data4[['vintage', 'response']]
aux1 = aux.groupby('vintage').count().reset_index()
aux1['response'] = 1
aux2 = pd.DataFrame(aux.value_counts(['vintage', 'response'])).reset_index().sort_values('vintage')
aux3 = pd.DataFrame(aux2.groupby('vintage').sum()[0]).reset_index()
aux3 = pd.merge(aux2, aux3, how = 'left', on = 'vintage')
aux3['percentage'] = aux3['0_x'] / aux3['0_y']
In [34]:
# Policy sales channel
plt.figure(figsize = (22, 4 ))

bar1 = sns.barplot(x = 'vintage', y = 'response', data =  aux1, color = 'darkblue', errorbar =  None)
bar2 = sns.barplot(x = 'vintage', y = 'percentage', data = aux3[aux3['response'] == 1], color = 'lightblue', errorbar =  None)

# add legend
top_bar = mpatches.Patch(color='darkblue', label='Response = No')
bottom_bar = mpatches.Patch(color='lightblue', label='Response = Yes')
plt.legend(handles=[top_bar, bottom_bar])
plt.ylim(0, 1)
plt.xticks(rotation= 90, size = 7)
plt.title('Percentage Responses by Vintage')

# show the graph
plt.show()
In [35]:
plt.figure(figsize = (5, 4 ))
sns.boxplot(x = 'response', y = 'vintage', data = insurance_data4)
plt.title('Vintage distribution by Response');

No information from this feature. Delete?

4.2.1.9. Vehicle Damage¶

In [36]:
aux = insurance_data4[['vehicle_damage', 'response']]
aux1 = aux.groupby('vehicle_damage').count().reset_index()
aux1['response'] = 1
aux2 = pd.DataFrame(aux.value_counts(['vehicle_damage', 'response'])).reset_index().sort_values('vehicle_damage')
aux3 = pd.DataFrame(aux2.groupby('vehicle_damage').sum()[0]).reset_index()
aux3 = pd.merge(aux2, aux3, how = 'left', on = 'vehicle_damage')
aux3['percentage'] = aux3['0_x'] / aux3['0_y']
In [37]:
# Policy sales channel
plt.figure(figsize = (5, 4 ))

bar1 = sns.barplot(x = 'vehicle_damage', y = 'response', data =  aux1, color = 'darkblue', errorbar =  None)
bar2 = sns.barplot(x = 'vehicle_damage', y = 'percentage', data = aux3[aux3['response'] == 1], color = 'lightblue', errorbar =  None)

# add legend
top_bar = mpatches.Patch(color='darkblue', label='Response = No')
bottom_bar = mpatches.Patch(color='lightblue', label='Response = Yes')
plt.legend(handles=[top_bar, bottom_bar])
plt.ylim(0, 1.25)
plt.xticks(rotation= 90, size = 7)
plt.title('Percentage Responses by Vehicle Damage')

# show the graph
plt.show()

4.2.1. Other Variable Relationship Analysis¶

4.2.1.1. Previously insured X Policy Sales Channel¶

In [38]:
aux = insurance_data4.groupby('policy_sales_channel').count()['id'].reset_index()
aux = pd.merge(insurance_data4, aux, on = 'policy_sales_channel', how = 'left')
plt.figure(figsize = (18,6))
plt.title('Previously insured X Policy Sales Channel')
sns.countplot(data = aux[aux['id_y'] > 500], x = 'policy_sales_channel', hue = 'previously_insured');

4.2.1.2. Previously insured X Vehicle damage¶

In [39]:
plt.figure(figsize = (6,4))
plt.title('Previously insured X Vehicle Damage')
sns.countplot(data = insurance_data4, x = 'previously_insured', hue = 'vehicle_damage');

4.2.1.3. Vehicle age X Previously insured¶

In [40]:
plt.figure(figsize = (6,4))
plt.title('Vehicle Age X Previously insured')
sns.countplot(data = insurance_data4, x = 'vehicle_age', hue = 'previously_insured');

4.2.1.4. Policy Sales Channel X Vehicle age¶

In [41]:
aux = insurance_data4.groupby('policy_sales_channel').count()['id'].reset_index()
aux = pd.merge(insurance_data4, aux, on = 'policy_sales_channel', how = 'left')
plt.figure(figsize = (18,6))
plt.title('Policy Sales Channel X Vehicle Age')
sns.countplot(data = aux[aux['id_y'] > 1000], x = 'policy_sales_channel', hue = 'vehicle_age');

4.2.1.5. Policy Sales Channel X Vehicle damage¶

In [42]:
aux = insurance_data4.groupby('policy_sales_channel').count()['id'].reset_index()
aux = pd.merge(insurance_data4, aux, on = 'policy_sales_channel', how = 'left')
plt.figure(figsize = (18,6))
plt.title('Policy Sales Channel X Vehicle Damage')
sns.countplot(data = aux[aux['id_y'] > 1000], x = 'policy_sales_channel', hue = 'vehicle_damage');

4.3. Multivariate Analysis¶

In [43]:
# Continuous variables
plt.figure(figsize = (6,4))
corr_heatmap = insurance_data4[['age','annual_premium', 'vintage']].corr()
heatmap = sns.heatmap(corr_heatmap, vmin=-1, vmax=1, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap');
In [44]:
# Categorical variables
# Calculate cramer V
cat_attributes = ['gender','region_code', 'policy_sales_channel', 'previously_insured', 'response', 'vehicle_age', 'vehicle_damage']

cramer_matrix = []
for i in cat_attributes:
    cramer_list = []
    for j in cat_attributes:
        cramer_list.append(cramer_v(insurance_data4[i], insurance_data4[j]))
    cramer_matrix.append(cramer_list)
        
        
cat_corr = pd.DataFrame({'gender': cramer_matrix[0],
                         'region_code': cramer_matrix[1],
                         'policy_sales_channel': cramer_matrix[2],
                         'previously_insured': cramer_matrix[3],
                         'response': cramer_matrix[4],
                         'vehicle_age': cramer_matrix[5],
                         'vehicle_damage': cramer_matrix[6]})

cat_corr = cat_corr.set_index(cat_corr.columns)

plt.figure(figsize=(6, 4) )
plt.title('Correlation Heatmap', fontdict={'fontsize':13}, pad=12);
sns.heatmap(cat_corr, vmin=0, vmax=1, annot=True, cmap='Blues');

5.0. Data Preparation¶

In [45]:
insurance_data5 = insurance_data4.copy()

5.1. Fillout NA¶

In [46]:
# No missing values

5.2. Standardization¶

In [47]:
ss = StandardScaler()

# Annual premium
insurance_data5['annual_premium'] = ss.fit_transform(insurance_data5[['annual_premium']].values )

5.3. Rescaling¶

In [48]:
mms_age = MinMaxScaler()
mms_vintage = MinMaxScaler()

# Age
insurance_data5['age'] = mms_age.fit_transform(insurance_data5[['age']].values )

# Vintage
insurance_data5['vintage'] = mms_vintage.fit_transform(insurance_data5[['vintage']].values )

5.4. Encoding¶

In [49]:
# Gender
insurance_data5['gender'] = insurance_data5['gender'].apply(lambda x: 0 if x == 'Male' else 1)

# Vehicle damage
insurance_data5['vehicle_damage'] = insurance_data5['vehicle_damage'].apply(lambda x: 0 if x == 'No' else 1)

# Vehicle age
insurance_data5['vehicle_age'] = insurance_data5['vehicle_age'].apply(lambda x: 0 if x == '< 1 Year' else 1 if x == '1-2 Year' else 2)

# region_code - Frequency Encoding / Target Encoding / Weighted Target Encoding
# Target Encoding
target_encode_region_code = insurance_data5.groupby('region_code')['response'].mean()
insurance_data5['region_code'] = insurance_data5['region_code'].map( target_encode_region_code )

# policy sales channel - Frequency Encoding / Target Encoding / Weighted Target Encoding
# Frequency encoding
target_encode_policy_sales = insurance_data5.groupby('policy_sales_channel')['response'].count() / len(insurance_data5)
insurance_data5['policy_sales_channel'] = insurance_data5['policy_sales_channel'].map( target_encode_policy_sales )

6.0. Feature Selection¶

In [50]:
insurance_data6 = insurance_data5.copy()

6.1. Manual Feature Selection¶

In [51]:
insurance_data6 = insurance_data6.drop(['driving_license', 'id'], axis = 1)

6.2. Extra Trees Feature Selection¶

In [52]:
X_train_n = insurance_data6.drop('response', axis = 1)
y_train_n = insurance_data6['response'].values

# Model definition
extra_trees = ExtraTreesClassifier( n_estimators = 250, n_jobs = -1 )

# Training model
extra_trees.fit( X_train_n, y_train_n )

# Get feature importances
importances = extra_trees.feature_importances_

# Load std of estimators feature importances
std = np.std( [tree.feature_importances_ for tree in extra_trees.estimators_], axis = 0 )

# Sort list of feature importance
indices = np.argsort( importances )[::-1]

# Print feature ranking
print('Feature Ranking: ')
df = pd.DataFrame()
for i, j in zip(X_train_n, extra_trees.feature_importances_):
    aux = pd.DataFrame( {'feature': i, 'importance': j}, index = [0])
    df = pd.concat( [ df, aux], axis = 0 )
    df = df.sort_values( 'importance', ascending = False ) 
print( df )
Feature Ranking: 
                feature  importance
0               vintage    0.284470
0        annual_premium    0.252330
0                   age    0.152346
0           region_code    0.102225
0        vehicle_damage    0.076358
0  policy_sales_channel    0.057825
0    previously_insured    0.053160
0           vehicle_age    0.016250
0                gender    0.005036
In [53]:
# Plot impurity-based feature importances of the forest
plt.figure(figsize = (6, 4) )
plt.title('Feature Importances')
plt.bar( range (X_train_n.shape[1]), importances[indices], color = "r", yerr = std[indices], align = "center")
plt.xticks( range( X_train_n.shape[1]), df['feature'].values, rotation = 90 )
plt.xlim( [ -1, X_train_n.shape[1] ] )
plt.show()

6.3. Final feature Selection¶

In [54]:
insurance_data6 = insurance_data6.drop( ['gender', 'vehicle_age'], axis = 1 )

7.0. Machine Learning Modelling¶

In [55]:
insurance_data7 = insurance_data6.copy()
In [56]:
y = insurance_data4['response']
X = insurance_data4.drop('response', axis = 1)
In [57]:
# Define Weights
weights = insurance_data5['response'].value_counts(normalize = True).values
weight = {0: weights[1], 1: weights[0]}

7.1. Models¶

7.1.1. Logistic Regression¶

In [58]:
lr_model = LogisticRegression( class_weight = weight, random_state = 42 )

lr_scores = cross_validation(X = X, 
                             y = y, 
                             model = lr_model, 
                             model_name = 'Logistic Regression',
                             test_size = 0.3, 
                             cv = 10, 
                             verbose = False)

lr_scores
Out[58]:
Model Name Precision Top-K CV Recall Top-K CV F1 Score CV AUC Score CV
0 Logistic Regression 0.12 +/- 0.024 0.02 +/- 0.0034 0.35 +/- 0.0733 0.72 +/- 0.0947
In [59]:
cross_validation(X = X, 
                 y = y, 
                 model = lr_model, 
                 model_name = 'Logistic Regression',
                 test_size = 0.3, 
                 cv = 1, 
                 verbose = True)
Precision top K:  0.142  | Recall Top K:  0.02006783493499152 0.4  | AUC Score:  0.783
Confusion Matrix:
 [[59086 41095]
 [  331 13821]]
Out[59]:
Model Name Precision Top-K CV Recall Top-K CV F1 Score CV AUC Score CV
0 Logistic Regression 0.14 +/- 0.0 0.02 +/- 0.0 0.4 +/- 0.0 0.78 +/- 0.0
<Figure size 400x400 with 0 Axes>

7.1.2. K-Nearest Neighbors (KNN)¶

In [60]:
knn_model = KNeighborsClassifier( n_neighbors = 5, weights = 'distance'  ) 

knn_scores = cross_validation(X = X, 
                              y = y, 
                              model = knn_model,
                              model_name = 'K-Nearest Neighbors',
                              test_size = 0.3, 
                              cv = 10, 
                              verbose = False)

knn_scores
Out[60]:
Model Name Precision Top-K CV Recall Top-K CV F1 Score CV AUC Score CV
0 K-Nearest Neighbors 0.24 +/- 0.0071 0.03 +/- 0.001 0.12 +/- 0.0028 0.6 +/- 0.0024
In [61]:
knn_model = KNeighborsClassifier( n_neighbors = 3, weights = 'distance'  ) 
cross_validation(X = X, 
                 y = y, 
                 model = knn_model,
                 model_name = 'K-Nearest Neighbors',
                 test_size = 0.3, 
                 cv = 1, 
                 verbose = True)
Precision top K:  0.229  | Recall Top K:  0.03282683486238532 0.15  | AUC Score:  0.577
Confusion Matrix:
 [[93808  6573]
 [12291  1661]]
Out[61]:
Model Name Precision Top-K CV Recall Top-K CV F1 Score CV AUC Score CV
0 K-Nearest Neighbors 0.23 +/- 0.0 0.03 +/- 0.0 0.15 +/- 0.0 0.58 +/- 0.0
<Figure size 400x400 with 0 Axes>

7.1.3. Random Forest¶

In [62]:
rf_model = RandomForestClassifier(n_estimators = 100, max_depth = 6, random_state = 42, class_weight = weight) 

rf_scores = cross_validation(X = X, 
                             y = y, 
                             model = rf_model, 
                             model_name = 'Random Forest',
                             test_size = 0.3, 
                             cv = 10, 
                             verbose = False)
rf_scores
Out[62]:
Model Name Precision Top-K CV Recall Top-K CV F1 Score CV AUC Score CV
0 Random Forest 0.39 +/- 0.0138 0.06 +/- 0.0017 0.42 +/- 0.0024 0.85 +/- 0.0013
In [63]:
cross_validation(X = X, 
                 y = y, 
                 model = rf_model, 
                 model_name = 'Random Forest',
                 test_size = 0.3, 
                 cv = 1, 
                 verbose = True)
Precision top K:  0.3935  | Recall Top K:  0.0558195616710405 0.423  | AUC Score:  0.848
Confusion Matrix:
 [[64992 35242]
 [  857 13242]]
Out[63]:
Model Name Precision Top-K CV Recall Top-K CV F1 Score CV AUC Score CV
0 Random Forest 0.39 +/- 0.0 0.06 +/- 0.0 0.42 +/- 0.0 0.85 +/- 0.0
<Figure size 400x400 with 0 Axes>

7.1.4. XGBoost¶

In [64]:
xgb_model = XGBClassifier(n_estimators = 100, max_depth = 6, random_state = 42, scale_pos_weight = 8)

xgb_scores = cross_validation(X = X, 
                              y = y, 
                              model = xgb_model, 
                              model_name = 'XGBoost',
                              test_size = 0.3, 
                              cv = 10, 
                              verbose = False)

xgb_scores
Out[64]:
Model Name Precision Top-K CV Recall Top-K CV F1 Score CV AUC Score CV
0 XGBoost 0.43 +/- 0.0102 0.06 +/- 0.0014 0.43 +/- 0.0019 0.85 +/- 0.0009
In [65]:
xgb_model = XGBClassifier(n_estimators = 100, max_depth = 6, random_state = 42, scale_pos_weight = 8)
cross_validation(X = X, 
                 y = y, 
                 model = xgb_model, 
                 model_name = 'XGBoost',
                 test_size = 0.3, 
                 cv = 1, 
                 verbose = True)
Precision top K:  0.4135  | Recall Top K:  0.058911525858384384 0.432  | AUC Score:  0.852
Confusion Matrix:
 [[67817 32478]
 [ 1211 12827]]
Out[65]:
Model Name Precision Top-K CV Recall Top-K CV F1 Score CV AUC Score CV
0 XGBoost 0.41 +/- 0.0 0.06 +/- 0.0 0.43 +/- 0.0 0.85 +/- 0.0
<Figure size 400x400 with 0 Axes>

7.2. Comparing Models Performance¶

In [93]:
models = ['Logistic Regression', 'K-Nearest Neighbors', 'Random Forest', 'XGBoost']
models_performance = pd.concat([lr_scores, knn_scores, rf_scores, xgb_scores])
models_performance['Model'] = models
models_performance.set_index('Model').sort_values('Precision Top-K CV', ascending = False).set_index('Model Name')
Out[93]:
Precision Top-K CV Recall Top-K CV F1 Score CV AUC Score CV
Model Name
XGBoost 0.43 +/- 0.0102 0.06 +/- 0.0014 0.43 +/- 0.0019 0.85 +/- 0.0009
Random Forest 0.39 +/- 0.0138 0.06 +/- 0.0017 0.42 +/- 0.0024 0.85 +/- 0.0013
K-Nearest Neighbors 0.24 +/- 0.0071 0.03 +/- 0.001 0.12 +/- 0.0028 0.6 +/- 0.0024
Logistic Regression 0.12 +/- 0.024 0.02 +/- 0.0034 0.35 +/- 0.0733 0.72 +/- 0.0947

7.3. Selected Model¶

XGBosst: Similar Results compared to Random Forest Model, but 2x faster.

8.0. Hyperparameter Fine Tuning¶

8.1. Optuna¶

In [67]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.3)

# Preprocess data
X_train, X_test, y_train, y_test = preprocessing(X_train, X_test, y_train, y_test)


def objective(trial):
    """Define the objective function"""

    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'scale_pos_weight': trial.suggest_int('scale_pos_weight',6,10),
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
        'subsample': trial.suggest_loguniform('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
        'eval_metric': 'map',#'mlogloss',
        'use_label_encoder': False
    }

    # Fit the model
    optuna_model = XGBClassifier(**params)
    optuna_model.fit(X_train, y_train)

    # Make predictions
    y_pred = optuna_model.predict_proba(X_test)

    # Evaluate predictions
    precision_top_k = score_top_k(y_test, y_pred, 2000)['precision']
    
    return precision_top_k

# study = optuna.create_study(direction='maximize')
# #study.optimize(objective, n_trials = 100 )

# print('Number of finished trials: {}'.format(len(study.trials)))
# print('Best trial:')
# trial = study.best_trial

# print('  Value: {}'.format(trial.value))
# print('  Params: ')

# for key, value in trial.params.items():
#     print('    {}: {}'.format(key, value))

8.2. Tuned Model¶

In [68]:
# Parameters set with best score
#params = trial.params
params = {'max_depth': 8,
         'learning_rate': 0.014416504517463456,
         'n_estimators': 499,
         'min_child_weight': 10,
         'scale_pos_weight': 10,
         'gamma': 0.5834982442488319,
         'subsample': 0.3703846362615623,
         'colsample_bytree': 0.6808135028844782,
         'reg_alpha': 1.001572471362163e-05,
         'reg_lambda': 1.1690461872764534e-05}

model = XGBClassifier(**params)

scores = cross_validation(X = X, 
                          y = y, 
                          model = model, 
                          model_name = 'XGBoost',
                          test_size = 0.3, 
                          cv = 10, 
                          verbose = False)
scores
Out[68]:
Model Name Precision Top-K CV Recall Top-K CV F1 Score CV AUC Score CV
0 XGBoost 0.43 +/- 0.0092 0.06 +/- 0.0014 0.42 +/- 0.0022 0.85 +/- 0.0008

9.0. Error Interpretation¶

In [69]:
insurance_data9 = insurance_data7.copy()

9.1. Model Performance¶

9.2. Cumulative Gain Curve¶

It shows the relationship between the number of customers contacted on the list ordered by the algorithm and the success rate. The closer the cumulative gains line is to the top-left corner of the chart, the greater the gain, the higher the proportion of the responders that are reached for the lower proportion of customers contacted.

In [70]:
skplt.metrics.plot_cumulative_gain(y_test, model.predict_proba(X_test), figsize = (6,6));

The optimum point is about 30% of the candidates, so we can reach 80% of the customers interested in the insurance.

9.3. Lift Curve¶

It shows how good the model is compared to a random one, according to the number of customers contacted. Ex: Lift = 2 means that the model is two times better than a random one.

.

In [73]:
skplt.metrics.plot_lift_curve(y_test, model.predict_proba(X_test), figsize = (6,6));

If we call about 30% of the available customers, the model will be about 2.7 times better than calling randomly.

9.2. Business performance¶

  • The average price per sales call is around 2 dollars, depending on factors like call duration.
  • The average price of health insurance is about $ 5280.00 depending on factors such as age, type of plan chosen, level of coverage, size of provider network, among others.
  • The company has a list with 127K of customers to call and offer the health insurance.
In [89]:
predict_proba = model.predict_proba(X_test)
total_customers = 127000.0

precision = [score_top_k(y_test, predict_proba, total_customers * 0.05)['precision'].item(),
             score_top_k(y_test, predict_proba, total_customers * 0.1 )['precision'].item(),
             score_top_k(y_test, predict_proba, total_customers * 0.2 )['precision'].item(),
             score_top_k(y_test, predict_proba, total_customers * 0.3 )['precision'].item(),
             score_top_k(y_test, predict_proba, total_customers * 0.4 )['precision'].item(),
             score_top_k(y_test, predict_proba, total_customers * 0.5 )['precision']]

recall = [score_top_k(y_test, predict_proba, total_customers * 0.05)['recall'].item(),
          score_top_k(y_test, predict_proba, total_customers * 0.1 )['recall'].item(),
          score_top_k(y_test, predict_proba, total_customers * 0.2 )['recall'].item(),
          score_top_k(y_test, predict_proba, total_customers * 0.3 )['recall'].item(),
          score_top_k(y_test, predict_proba, total_customers * 0.4 )['recall'].item(),
          score_top_k(y_test, predict_proba, total_customers * 0.5 )['recall']]

percentage_customers = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

no_customers = [( percentage_customers[0] * total_customers ) ,
                ( percentage_customers[1] * total_customers ) ,
                ( percentage_customers[2] * total_customers ) ,
                ( percentage_customers[3] * total_customers ) ,
                ( percentage_customers[4] * total_customers ) ,
                ( percentage_customers[5] * total_customers ) ]

recall_rand =  [( 0.123 * no_customers[0] ) / (0.123 * total_customers),
                ( 0.123 * no_customers[1] ) / (0.123 * total_customers),
                ( 0.123 * no_customers[2] ) / (0.123 * total_customers),
                ( 0.123 * no_customers[3] ) / (0.123 * total_customers),
                ( 0.123 * no_customers[4] ) / (0.123 * total_customers),
                ( 0.123 * no_customers[5] ) / (0.123 * total_customers)]

 
investment =  [no_customers[0] * 2,
               no_customers[1] * 2,
               no_customers[2] * 2,
               no_customers[3] * 2,
               no_customers[4] * 2,
               no_customers[5] * 2]
 
roi_model =  [ 5280.0 * 0.05 * total_customers * precision[0],
               5280.0 * 0.1  * total_customers * precision[1],
               5280.0 * 0.2  * total_customers * precision[2],
               5280.0 * 0.3  * total_customers * precision[3],
               5280.0 * 0.4 * total_customers * precision[4],
               5280.0 * 0.5 * total_customers * precision[5]]

roi_rand = [5280.0 * 0.05 * total_customers * 0.123, 
            5280.0 * 0.1  * total_customers * 0.123, 
            5280.0 * 0.2  * total_customers * 0.123, 
            5280.0 * 0.3  * total_customers * 0.123, 
            5280.0 * 0.4  * total_customers * 0.123, 
            5280.0 * 0.5  * total_customers * 0.123 ]

df_info = pd.DataFrame([np.round(percentage_customers,2),
                       np.round(no_customers,0),
                       np.round(investment, 2), 
                       np.round(np.round(roi_rand, 2 ) / 1000000, 2),
                       np.round(recall_rand, 2), 
                       np.round(np.round(roi_model, 2) / 1000000, 2),
                       np.round(recall, 2)]).T

df_info.columns = ['percentage customers', 'No. Customers', 'Investment ($)', 
                   'Random Calls Income ($)', 'Random Model Recall','ML Model Income($)', 'ML Model Recall']

df_info['Random Calls Income ($)'] = df_info['Random Calls Income ($)'].apply(lambda x: "$ " + str(x) + "M")
df_info['ML Model Income ($)'] = df_info['ML Model Income($)'].apply(lambda x: "$ " + str(x) + "M")

df_info
Out[89]:
percentage customers No. Customers Investment ($) Random Calls Income ($) Random Model Recall ML Model Income($) ML Model Recall ML Model Income ($)
0 0.05 6350.0 12700.0 $ 4.12M 0.05 14.29 0.19 $ 14.29M
1 0.10 12700.0 25400.0 $ 8.25M 0.10 26.25 0.36 $ 26.25M
2 0.20 25400.0 50800.0 $ 16.5M 0.20 46.91 0.64 $ 46.91M
3 0.30 38100.0 76200.0 $ 24.74M 0.30 62.55 0.85 $ 62.55M
4 0.40 50800.0 101600.0 $ 32.99M 0.40 71.07 0.97 $ 71.07M
5 0.50 63500.0 127000.0 $ 41.24M 0.50 73.45 1.00 $ 73.45M

9.3. Business Questions¶

What percentage of customers interested in health insurance the sales team will be able to reach if they make 20000 calls?

In [75]:
print(np.round(score_top_k(y_test, predict_proba, 20000)['recall'].item(),2))
0.52

What percentage of customers interested in health insurance the sales team will be able to reach if they make 40000 calls?

In [77]:
print(np.round(score_top_k(y_test, predict_proba, 40000)['recall'].item(),2))
0.88

How many phone calls the sales team have to make to reach 80% of the interested customers?

In [76]:
print("35000 calls:", np.round(score_top_k(y_test, predict_proba, 35000)['recall'].item(),1) * 100, "% of the customers.")
35000 calls: 80.0 % of the customers.

10.0. Model Deployment¶

10.1. Train Final Model With all Data¶

In [78]:
X = insurance_data4.drop('response', axis = 1)
y = insurance_data4['response']

######## Preprocessing ##########

# Vehicle damage
X['vehicle_damage'] = X['vehicle_damage'].apply(lambda x: 0 if x == 'No' else 1)

# region_code 
aux = X.copy()
aux['response'] = y
target_encoding_region_code = aux.groupby('region_code')['response'].mean()
target_encoding_region_code.to_csv('../src/parameter/target_encoding_region_code.csv', index = True )
X['region_code'] = X['region_code'].map( target_encoding_region_code )
X['region_code'].fillna(0, inplace = True)

# policy sales channel
frequency_encoding_policy_sales = aux.groupby('policy_sales_channel')['response'].count() / len(insurance_data5)
frequency_encoding_policy_sales.to_csv('../src/parameter/frequency_encoding_policy_sales.csv', index = True )
X['policy_sales_channel'] = X['policy_sales_channel'].map( frequency_encoding_policy_sales )
X['policy_sales_channel'].fillna(0, inplace = True)

# Feature selection
X = X[['age', 'region_code', 'policy_sales_channel', 'previously_insured', 'annual_premium', 'vintage', 'vehicle_damage']]

######## Create ML Model ##########

params = {'max_depth': 9,
          'learning_rate': 0.014331889541297329,
          'n_estimators': 313,
          'min_child_weight': 5,
          'scale_pos_weight': 8,
          'gamma': 2.3488317973981494e-06,
          'subsample': 0.07868170019741563,
          'colsample_bytree': 0.8875847384272901,
          'reg_alpha': 0.00019823852240508522,
          'reg_lambda': 1.2406673503434204e-07}

# Train model
xgb_model = XGBClassifier(**params).fit(X, y)

# Save model pkl
pickle.dump( xgb_model, open( '../src/model/xgb_model.pkl', 'wb' ) )

10.2. Health Insurance Class¶

In [79]:
import pandas as pd

class HealthInsurance:
    
    def __init__(self):
        self.region_code_path = '../src/parameter/target_encoding_region_code.csv'
        self.policy_sales_path = '../src/parameter/frequency_encoding_policy_sales.csv'
        self.age_mms = pickle.load(open('../src/parameter/mms_age.pkl', 'rb'))
        self.vintage_mms = pickle.load(open('../src/parameter/mms_vintage.pkl','rb'))
        self.annual_premium_ss = pickle.load(open('../src/parameter/ss_annual_premium.pkl', 'rb'))
        
    def preprocessing(self):
        
        # Age
        X['age'] = self.age_mms.transform(X[['age']].values )
        
        # Vintage
        X['vintage'] = self.vintage_mms.transform(X[['vintage']].values )
        
        # Annual premium
        X['annual_premium'] = self.annual_premium_ss.transform(X[['annual_premium']].values )
        
        # Vehicle damage
        X = self.copy()
        X['vehicle_damage'] = X['vehicle_damage'].apply(lambda x: 0 if x == 'No' else 1)

        # region_code 
        target_encoding_region_code = pd.read_csv(self.region_code_path).set_index('region_code')['response']
        X['region_code'] = X['region_code'].map( target_encoding_region_code )
        X['region_code'].fillna(0, inplace = True)

        # policy sales channel
        frequency_encoding_policy_sales = pd.read_csv(self.policy_sales_path).set_index('policy_sales_channel')['response']
        X['policy_sales_channel'] = X['policy_sales_channel'].map( frequency_encoding_policy_sales )
        X['policy_sales_channel'].fillna(0, inplace = True)

        # Feature selection
        X = X[['age', 'region_code', 'policy_sales_channel', 'previously_insured', 'annual_premium', 'vintage', 'vehicle_damage']]   
        
    def get_prediction( self, model, original_data, preocessed_data):
        predictions = model.predict_proba(processed_data)[::,1]
        
        # join predictions into the original data
        original_data['predictions'] = predictions
        original_data.sort_values( 'predictions', ascending = False )